1 Settings

# Define custom function for checking installation of packages and loading them

# Install a package if it is missing, then load (attach) it
#
# Args:
#   package: Character scalar naming a single CRAN package.
#
# Returns: TRUE if the package was attached successfully, FALSE otherwise
#          (the logical return of require()).
install_and_load_package <- function(package) {

    # requireNamespace() checks installation without attaching the package,
    # so a missing package does not trigger a noisy failed attach first
    if (!requireNamespace(package, quietly = TRUE)) {

        install.packages(package, dependencies = TRUE)

    }

    # Attach the package; character.only = TRUE because the name is a string.
    # TRUE/FALSE spelled out instead of the reassignable shorthands T/F.
    require(package, character.only = TRUE)

}

# Packages required for the analysis. "conflicted" is deliberately first so
# that the conflict_prefer() calls further below can resolve name masking
# between the later packages. Order is otherwise preserved as load order.
packages <- c(
    "conflicted", "here", "foreach", "doMC", "httr", "rvest", "pins",
    "tidyverse", "tidymodels", "naniar", "mice", "scales", "lubridate",
    "janitor", "philentropy", "Rtsne", "dendextend", "ggdag", "ggridges",
    "broom", "caret", "caretEnsemble", "reticulate", "keras", "tfestimators",
    "tensorflow", "tfdatasets", "PerformanceAnalytics", "fitdistrplus", "DT",
    "plotly", "RColorBrewer", "prettydoc"
)

# Install (if needed) and load all required packages. invisible() suppresses
# the list of TRUE/FALSE load results that a top-level lapply() would
# otherwise auto-print into the rendered report.

invisible(lapply(packages, install_and_load_package))

# Load `knitr` package for creating Rmarkdown reports (kept separate from the
# `packages` vector above since it is a reporting dependency, not part of the
# analysis stack)

library(knitr)

# Load custom helper functions (summary tables, histogram and heat-map plots)
# NOTE(review): these absolute, user-specific paths break on any other
# machine; consider the project-relative here::here() pattern already used in
# the commented read_csv() alternatives below

source("/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/MyFunctions/summary_stats.R")

source("/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/MyFunctions/plot_histogram.R")

source("/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/MyFunctions/plot_heatmap.R")

# Conflicted: explicitly resolve function-name conflicts between the attached
# packages; each entry is c(function_name, winning_package)
preferred_functions <- list(
    c("filter",      "dplyr"),
    c("select",      "dplyr"),
    c("first",       "dplyr"),
    c("last",        "dplyr"),
    c("lag",         "dplyr"),
    c("between",     "dplyr"),
    c("layout",      "plotly"),
    c("train",       "caret"),
    c("cluster",     "caret"),
    c("all_numeric", "recipes")
)

# Register every preference in the order listed above
for (pref in preferred_functions) {
    conflict_prefer(pref[[1]], pref[[2]])
}

# Set the default ggplot2 theme for all subsequent plots: light background,
# bold titles, pronounced axis lines, and strongly de-emphasised grid lines
# NOTE(review): `size` in element_line() is deprecated in favour of
# `linewidth` from ggplot2 3.4 onwards -- confirm the installed version

theme_set(theme_light() +
              theme(legend.text = element_text(),
                    plot.title  = element_text(face = "bold"),
                    axis.line   = element_line(size = 0.75),
                    panel.grid.major = element_line(size = 0.05),
                    panel.grid.minor = element_line(size = 0.05)))

# Sequential ColorBrewer palettes (9 colours each) reused across the plots;
# arguments given positionally: brewer.pal(n, name)

col_palette_red    <- brewer.pal(9, "OrRd")

col_palette_yellow <- brewer.pal(9, "YlOrRd")

col_palette_green  <- brewer.pal(9, "YlGn")

col_palette_blue   <- brewer.pal(9, "PuBu")

col_palette_grey   <- brewer.pal(9, "Greys")

# Parallel computing settings:
# Register all available CPU cores as the foreach/doMC backend

n_CPU_cores <- detectCores()

# detectCores() is documented to return NA when the core count cannot be
# determined; fall back to a single core so registerDoMC() never receives NA
if (is.na(n_CPU_cores)) {
    n_CPU_cores <- 1L
}

registerDoMC(cores = n_CPU_cores)

2 Competition Aim

The House Prices - Advanced Regression Techniques Kaggle competition uses root mean squared logarithmic error (RMSLE) as loss metric. The aim is to predict the target variable SalePrice for 1’459 houses in the test dataset, based on 79 provided features, as accurately as possible and to minimise RMSLE. This amounts to a classic regression problem. \[\begin{equation} L(y, \hat{y}) = \sqrt{\frac{1}{N} \sum_{i = 1}^{N} ( \log(y_{i} + 1) - \log(\hat{y}_{i} + 1) )^2} \end{equation}\]

Define custom root mean squared logarithmic (RMSLE) loss function (to be used with caret)

# Custom summary function computing RMSLE and RMSE, with the signature caret
# expects for trainControl(summaryFunction = ...)
#
# Args:
#   data:  data.frame with numeric columns `obs` (observed target values)
#          and `pred` (model predictions).
#   lev:   unused; required by the caret summaryFunction interface.
#   model: unused; required by the caret summaryFunction interface.
#
# Returns: named numeric vector with elements "RMSLE" and "RMSE".
RMSLE_summary <- function(data, lev = NULL, model = NULL) {

    # Root mean squared logarithmic error; log1p(x) equals log(x + 1) but is
    # numerically stable for small x. TRUE spelled out instead of T.
    RMSLE <- sqrt(mean((log1p(data$obs) - log1p(data$pred))^2, na.rm = TRUE))

    # Standard root mean squared error on the raw scale
    RMSE <- sqrt(mean((data$obs - data$pred)^2, na.rm = TRUE))

    # Combine both loss metrics into a named vector (names set directly in c())
    c(RMSLE = RMSLE, RMSE = RMSE)
}

3 Data Input

After having established the competition aim, the next step is to read in the data. We start with reading in the training data (from the CSV file provided by Kaggle).

# Read the Kaggle training data (includes the target SalePrice)
# NOTE(review): absolute path is machine-specific; the commented here()-based
# variant is the portable alternative
df_train <-
    read_csv(file = "/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/HousePrices/data/house-prices-advanced-regression-techniques/train.csv")

# df_train <-
#     read_csv(file = here("data/house-prices-advanced-regression-techniques/train.csv"))

Next, we read in the test data.

# Read the Kaggle test data (same features, no SalePrice column)
# NOTE(review): absolute path is machine-specific; the commented here()-based
# variant is the portable alternative
df_test <-
    read_csv(file = "/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/HousePrices/data/house-prices-advanced-regression-techniques/test.csv")

# df_test <-
#     read_csv(file = here("data/house-prices-advanced-regression-techniques/test.csv"))

Finally, Kaggle also provides a sample submission file, showing what the target variable predictions should look like.

# Read Kaggle's sample submission file (Id + placeholder SalePrice columns)
# NOTE(review): absolute path is machine-specific; the commented here()-based
# variant is the portable alternative
df_sample_submission <-
    read_csv(file = "/Users/Matthieu/Desktop/ML and DS Resources/R/Kaggle/HousePrices/data/house-prices-advanced-regression-techniques/sample_submission.csv")

# df_sample_submission <-
#     read_csv(file = here("data/house-prices-advanced-regression-techniques/sample_submission.csv"))

The competition target variable is SalePrice, which is a continuous variable. Hence, we’re dealing with a standard regression prediction problem.

4 Data Wrangling

I always start by combining the training and test data into one single data.frame. This eases exploratory data analysis (EDA) performed in the next step.

# Stack training and test rows into one data frame for a single EDA pass;
# the test rows carry NA in SalePrice
df_data <- bind_rows(df_train, df_test)

We also transform all character variables into factors, for better data handling.

# Convert every character column to a factor for easier downstream handling
df_data <- mutate(df_data, across(where(is.character), as.factor))

Some of the features have non-standard names, which we quickly account for here.

# Rename features whose names start with a digit (not valid as bare R names)
# NOTE(review): "ThidSsnPorch" looks like a typo for "ThirdSsnPorch" -- left
# as-is here because code outside this view may reference the current spelling
df_data <-
    df_data %>%
    rename(FirstFlrSF          = `1stFlrSF`,
           SecondFlrSF         = `2ndFlrSF`,
           ThidSsnPorch        = `3SsnPorch`)

5 Exploratory Data Analysis (EDA)

The next step is to get an overview of the provided dataset. This is known as exploratory data analysis (EDA), which enables to get an overview of the dataset and the relationships between the provided features and the target variable. We start with traditional and basic EDA and later proceed to a more sophisticated EDA with unsupervised machine learning techniques. An efficient initial step to get a first glimpse is to compute summary statistics of all 81 variables, including target variable and features, in the data set.

# Summary statistics for all 81 variables via the custom summary_stats()
# helper (rounded to 2 digits)
df_data_summary_stats <-
    df_data %>%
    summary_stats(round = T, digits = 2)

# Render the summary table as an interactive DT widget
df_data_summary_stats %>%
    datatable(options = list(pageLength = 30))

Let’s quickly count the number of numeric and categorical variables we have in the dataset.

# Count variables per type (numeric vs. categorical), with a totals row
df_data_summary_stats %>%
    count(Type) %>%
    adorn_totals(where = "row") %>%
    datatable()

Next, we check for missing values in the variables. We use the computed summary statistics and simply arrange by the variables with the most missing values.

# Variables sorted by percentage of missing values, most missing first
df_data_summary_stats %>%
    arrange(desc(`NA Perc.`)) %>%
    datatable()

We visualise this, by creating a graph with the number of missing values per feature (features with no missing values are excluded).

# Horizontal bar chart of the NA percentage per feature; features with zero
# missing values are excluded, bars ordered and coloured by NA percentage
p_missing_values <-
    df_data_summary_stats %>%
    filter(NAs > 0) %>%
    ggplot(aes(x     = fct_reorder(Variable, `NA Perc.`),
               y     = `NA Perc.`,
               fill  = fct_reorder(Variable, `NA Perc.`))) +
    geom_col(alpha = 0.9) +
    # Percentage labels offset slightly beyond the end of each bar
    geom_text(aes(y     = `NA Perc.` + 3,
                  label = paste0(`NA Perc.`, "%")),
              size = 2.5) +
    coord_flip() +
    scale_fill_viridis_d(name = "Features") +
    labs(title    = "Which Features Have the Most Missing Values?",
         subtitle = "Kaggle House Prices Competition",
         x        = "Feature",
         y        = "Percentage of Values Missing")

# Render interactively with plotly
p_missing_values %>%
    ggplotly(width  = 1000,
             height = 600)

The four variables PoolQC, MiscFeature, Alley, and Fence have missing values in more than 80% of observations, while FireplaceQu and SalePrice have around 50% missing values. SalePrice is the target variable however, so this is not worrisome. For the remaining variables with many NAs, missing value imputation might be a sensible strategy at a later point.

Let’s also inspect the number of outcomes per categorical variable. From the summary statistics we know that Neighborhood has 25 unique levels, the most of any variable in the dataset, while Street and CentralAir have only 2 unique levels. The table of all outcomes per categorical variable are shown below, with the variables in columns and the outcomes (or levels) in the rows.

# Count of observations per outcome for every categorical variable;
# variables end up in columns, outcomes (levels) in rows
df_data_categorical_counts <-
    df_data %>%
    select(where(is.factor)) %>%
    pivot_longer(cols      = everything(),
                 names_to  = "Variable",
                 values_to = "Outcome") %>%
    group_by(Variable) %>%
    count(Outcome) %>%
    ungroup() %>%
    arrange(Variable, desc(n)) %>%
    pivot_wider(id_cols     = Outcome,
                names_from  = "Variable",
                values_from = "n")

df_data_categorical_counts %>%
    datatable(options = list(pageLength = 30))

Next, we turn to the target variable SalePrice and plot its univariate distribution…

# Histogram (100 bins) of SalePrice on the combined data; the 1459 test rows
# have NA SalePrice and are dropped by the plot, which produces the warnings
# shown below
p_hist_target <-
    df_data %>%
    plot_histogram(x        = SalePrice,
                   bins     = 100,
                   title    = "Target Variable SalePrice",
                   subtitle = "Histogram of Statistical Distribution") +
    scale_x_continuous(labels = dollar_format(big.mark = "'"),
                       breaks = pretty_breaks(n = 10))
# Render interactively with plotly
p_hist_target %>%
    ggplotly(width  = 1000,
             height = 600)
## Warning: Removed 1459 rows containing non-finite values (stat_bin).
## Warning: Removed 1459 rows containing non-finite values (stat_density).

…as well as against the index variable.

# SalePrice against the row Id, to spot any ordering artefacts in the data
p_scatter_target <-
    df_data %>%
    ggplot(aes(x = Id, y = SalePrice)) +
    geom_point(col   = col_palette_blue[7],
               alpha = 0.5) +
    scale_y_continuous(labels = dollar_format(big.mark = "'"),
                       breaks = breaks_pretty(n = 10)) +
    scale_x_continuous(breaks = pretty_breaks(n = 10))
p_scatter_target %>%
    ggplotly(width  = 1000,
             height = 600)

SalePrice is right-skewed, as is clearly visible in the univariate histogram, with house prices going as high as USD 755'000 in the right tail. However, the largest density of houses is sold for prices between around USD 100’000 and USD 300’000.

We now proceed to look at multivariate relationships in the data and start with the case of simple linear relations. Arguably, the relationships between the variables are best spotted with plots. Hence, we create a scatter plot with the target variable SalePrice against all numerical features and include a linear regression line.

# Faceted scatter plots of SalePrice against every numeric feature, each with
# a linear (lm) regression line; warnings about NA SalePrice rows suppressed
# NOTE(review): suppressWarnings() around the whole chain also hides any
# unrelated warnings -- consider narrowing it
suppressWarnings(
    p_scatter_target_vs_numerical_features <-
        df_data %>%
        select(where(is.numeric)) %>%
        pivot_longer(cols      = -SalePrice,
                     names_to  = "Variables",
                     values_to = "Values") %>%
        ggplot(aes(y = SalePrice, x = Values, col = Variables)) +
        geom_point(alpha = 0.5) +
        geom_smooth(formula = y ~ x,
                    method  = "lm",
                    se      = T,
                    col     = col_palette_red[7]) +
        facet_wrap(facets = . ~ Variables,
                   scales = "free_x") +
        scale_y_continuous(labels = dollar_format(big.mark = "'"),
                           breaks = breaks_pretty(n = 6)) +
        scale_color_viridis_d() +
        labs(title    = "Target vs. Numerical Features",
             subtitle = "Scatter Plot with Regression Line",
             x        = NULL) +
        theme(strip.background = element_rect(fill = col_palette_grey[6]))
)
# Render interactively with plotly
suppressWarnings(
    p_scatter_target_vs_numerical_features %>%
        ggplotly(width  = 2000,
                 height = 1200)
)

Linear relations with the target variable SalePrice are detectable for the features 1stFlrSF, BsmtFinSF1, Fireplaces, FullBath, GarageArea, GarageCars, GarageYrBlt, GrLivArea, LotArea, LotFrontage, MasVnrArea, OpenPorchSF, OverallQual, TotalBsmtSF, TotRmsAbvGrd, WoodDeckSF, and YearBuilt.

We recreate the same plot, but instead of the linear regression line, we now use a generalised additive regression (GAM) line.

# Same faceted scatter plots, now with a non-linear GAM smoother; this
# overwrites p_scatter_target_vs_numerical_features from the linear version
# NOTE(review): the title says "Target vs. Features" while the linear variant
# says "Target vs. Numerical Features" -- likely meant to match
suppressWarnings(
    p_scatter_target_vs_numerical_features <-
        df_data %>%
        select(where(is.numeric)) %>%
        pivot_longer(cols      = -SalePrice,
                     names_to  = "Variables",
                     values_to = "Values") %>%
        ggplot(aes(y = SalePrice, x = Values, col = Variables)) +
        geom_point(alpha = 0.5) +
        geom_smooth(method  = "gam",
                    se      = F,
                    col     = col_palette_red[7]) +
        facet_wrap(facets = . ~ Variables,
                   scales = "free_x") +
        scale_y_continuous(labels = dollar_format(big.mark = "'"),
                           breaks = breaks_pretty(n = 6)) +
        scale_color_viridis_d() +
        labs(title    = "Target vs. Features",
             subtitle = "Scatter Plot with Regression Line",
             x        = NULL) +
        theme(strip.background = element_rect(fill = col_palette_grey[6]))
)
# Render interactively with plotly
suppressWarnings(
    p_scatter_target_vs_numerical_features %>%
        ggplotly(width  = 2000,
                 height = 1200)
)

After considering the non-linear GAM model, it seems that only the features 1stFlrSF, 2ndFlrSF (which wasn’t clearly related in the linear case), BsmtFinSF1, GarageYrBlt, GrLivArea, MasVnrArea, OverallQual, and TotalBsmtSF are indicative of the target SalePrice. This illustrates the importance of looking at non-linear relationships in the data. Only taking into account linear relations is all too often misleading.

Let’s repeat the same exercise with the categorical features and boxplots.

# Faceted boxplots of SalePrice per level of every categorical feature;
# select(!(where(is.numeric)), SalePrice) keeps the factors plus the target
suppressWarnings(
    p_boxplot_target_vs_categorical_features <-
        df_data %>%
        select(!(where(is.numeric)),
               SalePrice) %>%
        pivot_longer(cols      = -SalePrice,
                     names_to  = "Variables",
                     values_to = "Values") %>%
        ggplot(aes(y = SalePrice, x = Values, col = Variables)) +
        geom_boxplot() +
        facet_wrap(facets = . ~ Variables,
                   scales = "free_x") +
        scale_y_continuous(labels = dollar_format(big.mark = "'"),
                           breaks = breaks_pretty(n = 6)) +
        scale_color_viridis_d() +
        labs(title    = "Target vs. Categorical Features",
             subtitle = "Boxplots",
             x        = NULL) +
        theme(strip.background = element_rect(fill = col_palette_grey[6]))
)
# Render interactively with plotly
suppressWarnings(
    p_boxplot_target_vs_categorical_features %>%
        ggplotly(width  = 2000,
                 height = 1200)
)
# TODO: Ridge plot

# df_data %>%
#     select(!(where(is.numeric)),
#            SalePrice) %>%
#     pivot_longer(cols      = -SalePrice,
#                  names_to  = "Variables",
#                  values_to = "Values") %>%
#     ggplot(aes(x = SalePrice, y = Variables, fill = Variables)) +
#     geom_density_ridges() +
#     # theme_ridges() +
#     facet_wrap(facets = . ~ Variables,
#                scales = "free_x") +
#     scale_x_continuous(labels = dollar_format(big.mark = "'"),
#                        breaks = breaks_pretty(n = 6)) +
#     scale_fill_viridis_d() +
#     labs(title    = "Target vs. Categorical Features",
#          subtitle = "Boxplots",
#          x        = NULL) +
#     theme(strip.background = element_rect(fill = col_palette_grey[6]))

Compute linear Pearson correlation matrix for target variable and features, after turning categorical variables into numeric features (quick and dirty solution)

# Pearson correlation matrix over all variables; factors are coerced to their
# integer level codes (quick-and-dirty: the resulting ordering is arbitrary)
# NOTE(review): the "standard deviation is zero" warning below indicates at
# least one constant column after coercion
df_correlation_matrix_Pearson <-
    df_data %>%
    mutate(across(where(is.factor), as.numeric)) %>%
    # select(where(is.numeric)) %>%
    select(SalePrice, everything()) %>%
    cor(method = "pearson",
        use    = "pairwise.complete.obs") %>%
    as_tibble() %>%
    # colnames(.) of the correlation matrix gives the variable names back
    mutate(Variable = colnames(.)) %>%
    select(Variable, everything())
## Warning in cor(., method = "pearson", use = "pairwise.complete.obs"): the
## standard deviation is zero

Visualise Pearson correlation matrix in heat map

# Heat map of the Pearson correlation matrix via the custom plot_heatmap()
p_correlation_matrix_Pearson <-
    df_correlation_matrix_Pearson %>%
    plot_heatmap(var         = Variable,
                 title       = "Features Correlating with Sale Price?",
                 subtitle    = "Pearson Linear Correlation Matrix",
                 legend_name = "Pearson Correlation")

# Render interactively with plotly
p_correlation_matrix_Pearson %>%
    ggplotly(width  = 2000,
             height = 1200)

Compute Spearman ranking correlation matrix for target variables and features

# Spearman rank correlation matrix, same factor-to-numeric coercion as for
# the Pearson matrix above; zero-sd warnings suppressed
suppressWarnings(
    df_correlation_matrix_Spearman <-
        df_data %>%
        mutate(across(where(is.factor), as.numeric)) %>%
        select(SalePrice, everything()) %>%
        cor(method = "spearman",
            use    = "pairwise.complete.obs") %>%
        as_tibble() %>%
        mutate(Variable = colnames(.)) %>%
        select(Variable, everything())
)
# Compute Kendall's Tau ranking correlation matrix for target variables and features

# TODO: Takes ages under current settings

# df_correlation_matrix_Kendall <-
#     df_data %>%
#     mutate(across(is.factor, as.numeric)) %>%
#     # select(where(is.numeric)) %>%
#     select(SalePrice, everything()) %>%
#     cor(method = "kendall",
#         use    = "pairwise.complete.obs") %>%
#     as_tibble() %>%
#     mutate(Variable = colnames(.)) %>%
#     select(Variable, everything())

# df_correlation_matrix_Kendall_filtered <-
#     df_correlation_matrix_Kendall %>%
#     select(Variable, SalePrice) %>%
#     filter(SalePrice >= 0.5) %>%
#     arrange(desc(SalePrice)) %>%
#     rename(SalePriceKendall = SalePrice)

Compare the different correlation measures

# Long format of the Pearson matrix: one row per variable pair
df_correlation_matrix_Pearson_long <-
    df_correlation_matrix_Pearson %>%
    pivot_longer(cols      = -Variable,
                 names_to  = "Variable_2",
                 values_to = "Pearson_Correlation")

# Long format of the Spearman matrix
df_correlation_matrix_Spearman_long <-
    df_correlation_matrix_Spearman %>%
    pivot_longer(cols      = -Variable,
                 names_to  = "Variable_2",
                 values_to = "Spearman_Correlation")

# Join both correlation measures per variable pair
df_correlation_matrices <-
    df_correlation_matrix_Pearson_long %>%
    full_join(df_correlation_matrix_Spearman_long,
              by = c("Variable", "Variable_2"))

# Variables correlating at least 0.5 with SalePrice under either measure,
# sorted by Spearman then Pearson, with the difference between the two
df_correlation_matrices %>%
    filter(Variable_2 == "SalePrice",
           Pearson_Correlation >= 0.5 | Spearman_Correlation >= 0.5) %>%
    mutate(Diff = Spearman_Correlation - Pearson_Correlation) %>%
    arrange(desc(Spearman_Correlation), desc(Pearson_Correlation)) %>%
    mutate(across(where(is.numeric), ~ round(., digits = 3))) %>%
    datatable(options = list(pageLength = 20))
# FIXME: Takes a very long time!

# df_data %>%
#     select(-Id) %>%
#     select(where(is.numeric)) %>%
#     select(SalePrice, 2:12) %>%
#     chart.Correlation(histogram = T, method = "pearson", pch = 19)
#
# df_data %>%
#     select(-Id) %>%
#     select(where(is.numeric)) %>%
#     select(SalePrice, 13:24) %>%
#     chart.Correlation(histogram = T, method = "pearson", pch = 19)
#
# df_data %>%
#     select(-Id) %>%
#     select(where(is.numeric)) %>%
#     select(SalePrice, 24:37) %>%
#     chart.Correlation(histogram = T, method = "pearson", pch = 19)

As a last step, let’s encode all categorical features as numerical ones. We use one-hot encoding here (an alternative to prevent the dummy-variable trap and its multicollinearity problem for linear models is standard dummy encoding).

# TODO: One-hot or dummy encoding?

# One-hot encode all factor features; fullRank = FALSE keeps a column for
# every factor level (one-hot) rather than dropping a reference level (dummy)
model_dummy   <- dummyVars(formula  = ~ .,
                           data     = df_data,
                           sep      = "_",
                           fullRank = FALSE)

# Apply the encoding and convert the resulting numeric matrix to a tibble
df_data_dummy <- predict(model_dummy,
                         newdata = df_data) %>%
    as_tibble()

6 Unsupervised Machine Learning

We now turn to the second part of EDA, with the help of unsupervised machine learning. The key characteristic of unsupervised machine learning is that there is no target variable or label provided. Solely on the basis of the provided features, we try to gauge relationships in the data. Typical examples of unsupervised machine learning algorithms include k-Means, hierarchical clustering, or neural network autoencoders, as well as dimensionality-reduction algorithms such as principal component analysis (PCA) or stochastic neighborhood embedding (SNE). However, we start simple by normalising our dummyfied data.

# Centre and scale every encoded column. scale() returns a one-column matrix,
# so wrap it in as.numeric() to keep plain numeric columns in the tibble.
df_data_normalised <-
    df_data_dummy %>%
    mutate(across(everything(),
                  ~ as.numeric(scale(., center = TRUE, scale = TRUE))))

# Equivalent caret-based alternative:
# df_data_dummy %>%
#     preProcess(method = c("center", "scale"))

To perform hierarchical clustering, we need a distance matrix between all features. A common choice is the one-dimensional Euclidean distance matrix. However, the Euclidean distance yields very poor results with the given dataset and we need a distance metric of higher dimension for this problem. Hence, we calculate the Manhattan distance (other distance metrics include Minkowski, Chebyshev, Kullback-Leibler, or Jensen-Shannon).

# Euclidean distance kept for reference (gave poor clusterings here):
# dist_data_numerical_normalised <-
#     dist(df_data_normalised,
#          method = "euclidean")

# Manhattan (L1) pairwise distance matrix between all observations
dist_data_numerical_normalised <-
    dist(df_data_normalised,
         method = "manhattan")

# Timed Minkowski alternative, kept for reference:
# system.time(
#     dist_data_numerical_normalised <-
#     dist(df_data_normalised,
#          method = "minkowski",
#          p      = 5)
#     )

# Kullback-Leibler alternative via philentropy, kept for reference:
# system.time(
#     dist_data_numerical_normalised <-
#         distance(df_data_normalised,
#                  method = "kullback-leibler",
#                  p      = 5)
# )

# philentropy::kullback_leibler_distance()

We can then perform hierarchical clustering, using the complete linkage method.

# Agglomerative hierarchical clustering with complete linkage on the
# Manhattan distance matrix
hc_data_numerical_normalised <-
    hclust(dist_data_numerical_normalised,
           method = "complete")

Next, we compute the assignment vector with a k of five (number of clusters, arbitrary choice here).

# Cut the dendrogram into k = 5 clusters (arbitrary choice) to get a cluster
# assignment per observation
clusters_k5 <-
    cutree(hc_data_numerical_normalised,
           k = 5)

Create a new data frame storing these results.

# Attach the cluster assignments (as a factor) to the encoded data
df_hc_clustering <-
    df_data_dummy %>%
    mutate(clusters = factor(clusters_k5))

# Show the size of each cluster
df_hc_clustering %>%
    count(clusters) %>%
    datatable()

We quickly visualise the assigned clusters.

# Boxplots of SalePrice per assigned cluster; test rows with NA SalePrice are
# dropped by the plot (hence the warning shown below)
suppressWarnings(
    df_hc_clustering %>%
        # pivot_longer(cols      = -c(SalePrice, clusters),
        #              names_to  = "Variables",
        #              values_to = "Values") %>%
        ggplot(aes(x = clusters, y = SalePrice, col = clusters)) +
        geom_boxplot() +
        scale_y_continuous(labels = dollar, breaks = pretty_breaks()) +
        labs(title = "Hierarchical Clustering, Based on Manhattan Distance Matrix",
             subtitle = "Kaggle House Price Competition")
)
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

# Create a dendrogram from the hclust variable and plot it

# dend_hc_data_numerical_normalised <-
#     hc_data_numerical_normalised %>%
#     as.dendrogram() %>%
#     color_branches(k = 5)

# p <- dend_hc_data_numerical_normalised %>%
#     ggplot() +
#     labs(title = "Dendrogram")
# # theme(axis.text.x = element_text(angle = 90))

# p %>% ggplotly(width  = 1000,height = 600

As shown above, the assigned separation performed with hierarchical clustering is only mildly satisfactory. Hence, we instead proceed with another technique, which is t-distributed stochastic neighbor embedding (t-SNE). t-SNE is a dimensionality-reduction technique, similar to PCA, but has the advantage of being able to deal with non-linear relations in the data. Its disadvantage, however, is that it cannot deal with missing values.

# TODO: Check whether more features with NAs need to be excluded (e.g. all SalePrices with NAs
# are excluded)

# t-SNE cannot handle missing values: drop the feature groups with the most
# NAs, then keep only rows that are complete across the remaining columns
df_data_tSNE <-
    df_data_dummy %>%
    select(-contains("Pool"),
           -contains("MiscFeature"),
           -contains("Alley"),
           -contains("Fence"),
           -contains("FireplaceQu"),
           -contains("LotFrontage")) %>%
    # if_all() is the supported idiom for row-wise conditions across columns;
    # filter(across(...)) is deprecated in current dplyr
    filter(if_all(everything(), ~ !is.na(.)))

# Fix the RNG seed so the stochastic t-SNE embedding is reproducible
set.seed(123)

l_tSNE_output <-
    Rtsne(df_data_tSNE,
          dims         = 2,                   # Number of dimensions in low-dimensional space
          initial_dims = ncol(df_data_tSNE),  # Number of dimensions to retain in initial PCA step
          # Fixed argument name: was misspelled "perplexiy", so the intended
          # value was not actually passed to Rtsne
          perplexity   = 30,                  # Estimate of number of close neighbors to each original point
          theta        = 0.5,
          pca          = TRUE,
          normalize    = TRUE,
          max_iter     = 1000,
          verbose      = TRUE)
## Performing PCA
## Read the 1338 x 270 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.23 seconds (sparsity = 0.079865)!
## Learning embedding...
## Iteration 50: error is 62.038678 (50 iterations in 0.18 seconds)
## Iteration 100: error is 54.414442 (50 iterations in 0.18 seconds)
## Iteration 150: error is 53.073869 (50 iterations in 0.19 seconds)
## Iteration 200: error is 52.488086 (50 iterations in 0.19 seconds)
## Iteration 250: error is 52.076051 (50 iterations in 0.20 seconds)
## Iteration 300: error is 0.737355 (50 iterations in 0.19 seconds)
## Iteration 350: error is 0.518483 (50 iterations in 0.18 seconds)
## Iteration 400: error is 0.453935 (50 iterations in 0.17 seconds)
## Iteration 450: error is 0.425334 (50 iterations in 0.19 seconds)
## Iteration 500: error is 0.412460 (50 iterations in 0.18 seconds)
## Iteration 550: error is 0.402922 (50 iterations in 0.18 seconds)
## Iteration 600: error is 0.394589 (50 iterations in 0.17 seconds)
## Iteration 650: error is 0.388339 (50 iterations in 0.18 seconds)
## Iteration 700: error is 0.384546 (50 iterations in 0.17 seconds)
## Iteration 750: error is 0.381335 (50 iterations in 0.17 seconds)
## Iteration 800: error is 0.377010 (50 iterations in 0.17 seconds)
## Iteration 850: error is 0.372271 (50 iterations in 0.17 seconds)
## Iteration 900: error is 0.367776 (50 iterations in 0.16 seconds)
## Iteration 950: error is 0.364231 (50 iterations in 0.16 seconds)
## Iteration 1000: error is 0.364497 (50 iterations in 0.16 seconds)
## Fitting performed in 3.54 seconds.

Plot Kullback-Leibler divergence, i.e. the loss, between data points in the original high-dimensional dataset and in the new lower dimensional space

# Total KL divergence recorded every 50 iterations by Rtsne (itercosts)
df_tSNE_KL_iter <-
    tibble(KL = l_tSNE_output$itercosts) %>%
    mutate(index = row_number())

# Iteration snapshot with the lowest total KL divergence
df_tSNE_KL_iter_min <-
    df_tSNE_KL_iter %>%
    filter(KL == min(KL, na.rm = T))

# Final per-point KL divergence after the last iteration (costs)
df_tSNE_KL_final <-
    tibble(KL = l_tSNE_output$costs) %>%
    mutate(index = row_number())

# Loss curve across iterations, with the minimum highlighted as a star
df_tSNE_KL_iter %>%
    ggplot(aes(x = index, y = KL)) +
    geom_line() +
    geom_point() +
    geom_point(x    = df_tSNE_KL_iter_min$index,
               y    = df_tSNE_KL_iter_min$KL,
               col  = col_palette_blue[7],
               size = 5,
               pch  = 8) +
    labs(title    = "Total Kullback-Leibler Divergence per t-SNE Iteration",
         subtitle = "Kaggle House Price Competition") +
    scale_x_continuous(breaks = pretty_breaks()) +
    scale_y_continuous(breaks = pretty_breaks())

# Final KL divergence per data point
df_tSNE_KL_final %>%
    ggplot(aes(x = index, y = KL)) +
    geom_line() +
    geom_point() +
    labs(title    = "Final Kullback-Leibler Divergence of t-SNE per Feature",
         subtitle = "Kaggle House Price Competition") +
    scale_x_continuous(breaks = pretty_breaks()) +
    scale_y_continuous(labels = label_number(),
                       breaks = pretty_breaks())

Store the first two coordinates and plot them

# Collect the two embedding coordinates; SalePrice is binned into 5 intervals
# to colour the points (rows align since df_data_tSNE was fed to Rtsne as-is)
df_tSNE <- tibble(Dim_x  = l_tSNE_output$Y[ , 1],
                  Dim_y  = l_tSNE_output$Y[ , 2],
                  target = cut(df_data_tSNE$SalePrice, breaks = 5))

Plot the two computed dimensions from the t-SNE algorithm.

# Scatter plot of the 2-D t-SNE embedding, coloured and shaped by the binned
# SalePrice target
df_tSNE %>%
    ggplot(aes(x = Dim_x, y = Dim_y, color = target, pch = target)) +
    geom_point() +
    labs(title    = "t-SNE of House Price Data",
         subtitle = "Non-Linear Dimensionality Reduction") +
    scale_color_viridis_d()

# geom_text(aes(label = target))
# theme(legend.position = "none")

7 Feature Engineering

Feature Engineering creates additional features from the given variables, by combining them in innovative ways. For numerical feature engineering, blending with linear and non-linear combinations are used most often. For categorical features, string detection and extraction may be beneficial if we’re dealing with text data. In general, feature engineering is where domain knowledge of the data at hand is needed. It is probably the most creative aspect of machine learning competitions. Let’s create TotalHouseArea as an additional feature by combining the areas of the basement TotalBsmtSF, the 1st floor FirstFlrSF, and the 2nd floor SecondFlrSF.

# TODO: Also include for deep learning model

# TODO: Also include for deep learning model

# Engineer TotalHouseArea: basement + first-floor + second-floor square feet
df_data <- mutate(df_data,
                  TotalHouseArea = TotalBsmtSF + FirstFlrSF + SecondFlrSF)

Create TotalBathsN from BsmtFullBath, BsmtHalfBath, FullBath, and HalfBath

# Engineer TotalBathsN: full baths count fully, half baths count as 0.5
df_data <- mutate(df_data,
                  TotalBathsN = BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath)

Resplit train and test data with new features according to Id variable

# Re-split the combined data into train/test by matching the Id values of the
# original splits (engineered features are now included)

df_train <-
    df_data %>%
    filter(Id %in% df_train$Id)

df_test <-
    df_data %>%
    filter(Id %in% df_test$Id)

# NOTE(review): df_data_dummy was created before the feature engineering
# above, so these dummy splits do not contain TotalHouseArea / TotalBathsN

df_train_dummy <-
    df_data_dummy %>%
    filter(Id %in% df_train$Id)

df_test_dummy <-
    df_data_dummy %>%
    filter(Id %in% df_test$Id)

8 Missing Value Imputation

Missing value imputation is often a part of data pre-processing in machine learning problems. It is especially beneficial for ML algorithms which cannot handle missing data, since otherwise too much data needs to be discarded. In general, there are different reasons for missing values, i.e. missing completely at random (MCAR), missing at random (MAR), and missing not at random (MNAR). Data missing non randomly may for example occur when survey answers of survey participants with specific characteristics are systematically excluded. Let’s start by exploring which values are missing in the dataset more in detail.

# Overview of missingness per variable and observation (naniar::vis_miss)
df_data %>%
    vis_miss(cluster = F)

# Same plot, with rows clustered and variables sorted by missingness
df_data %>%
    vis_miss(cluster = T, sort_miss = T)

Very useful plot!

# UpSet plot of co-occurring missingness patterns across features (SalePrice
# excluded since its NAs just mark the test rows)
df_data %>%
    select(-SalePrice) %>%
    gg_miss_upset()

# Missingness per feature across SalePrice values
suppressWarnings(
    df_data %>%
        gg_miss_fct(fct = SalePrice)
)
## Warning: Removed 82 rows containing missing values (geom_tile).

The simplest way of imputing missing values is to just replace NAs by the mean value of the respective variable. This is a univariate imputation method, based on one single feature at a time. Let’s quickly do this in the following. For categorical features, the mode value, i.e. the category occurring most frequently, is filled in (as compared to the mean for numeric features).

# Univariate mean imputation of every column (naniar/simputation-style helper)
# NOTE(review): across(everything(), ...) also hits factor columns -- confirm
# impute_mean performs the mode-style fill the surrounding text claims
df_data_imp_mean <-
    df_data %>%
    mutate(across(everything(), impute_mean))

# Same, using the median per column
df_data_imp_median <-
    df_data %>%
    mutate(across(everything(), impute_median))


# Recipes-based median imputation on the one-hot encoded data
rec <- recipe(formula = SalePrice ~ .,
              data    = df_data_dummy)

# NOTE(review): step_medianimpute() was renamed step_impute_median() in newer
# recipes releases -- confirm the installed version still exports it
rec_impute_median <- rec %>%
    # step_knnimpute(-SalePrice) %>%
    step_medianimpute(-SalePrice)

# Estimate the per-column medians on the full encoded dataset
model_impute_median <- prep(rec_impute_median,
                            training = df_data_dummy)

# Apply the imputation, keeping all columns in the output
df_data_dummy_imputed_median <-
    bake(model_impute_median,
         new_data = df_data_dummy,
         everything())

Resplit train and test data with new features according to Id variable

# Re-split train/test again; the dummy splits now come from the
# median-imputed encoded data
# NOTE(review): the df_train/df_test assignments repeat the earlier re-split
# unchanged -- only the *_dummy assignments differ here

df_train <-
    df_data %>%
    filter(Id %in% df_train$Id)

df_test <-
    df_data %>%
    filter(Id %in% df_test$Id)

df_train_dummy <-
    df_data_dummy_imputed_median %>%
    filter(Id %in% df_train$Id)

df_test_dummy <-
    df_data_dummy_imputed_median %>%
    filter(Id %in% df_test$Id)

9 Model Fitting

Set train control for caret models We’re dealing with a regression problem, so classProbs needs to be set to FALSE

# caret resampling/training settings: 10-fold cross-validation repeated 5
# times, custom RMSLE summary, and random hyperparameter search.
# TRUE/FALSE spelled out instead of the reassignable shorthands T/F.
ctrl <- trainControl(method          = "repeatedcv",   # "LGOCV", "adaptive_cv"
                     # adaptive        = list(),
                     number          = 10,             # Number of folds
                     repeats         = 5,              # Number of repeats (complete sets of folds)
                     classProbs      = FALSE,          # Regression problem
                     summaryFunction = RMSLE_summary,  # custom loss metric
                     search          = "random",       # random hyperparameter search # "grid"
                     preProcOptions  = list(thresh = 0.95, ICAcomp = 3, k = 5, freqCut = 95/5,
                                            uniqueCut = 10, cutoff = 0.9),
                     verboseIter     = TRUE,
                     allowParallel   = TRUE)           # Allow for parallel computing

Setting up a baseline model, serving as comparison to later models

# Baseline model: plain GLM with near-zero-variance filtering, median
# imputation, centring and scaling; wall-clock timed with system.time().
# na.action = na.pass defers NA handling to the medianImpute preprocessing.
system.time(
    model_glm <- train(SalePrice ~ .,
                       data           = df_train,
                       method         = "glm",
                       # tuneGrid       = df_glmnet_grid,
                       na.action      = na.pass,
                       preProcess     = c("nzv", "medianImpute", "center", "scale"),  # knnImpute
                       trControl      = ctrl,
                       metric         = "RMSLE")
)
## Aggregating results
## Fitting final model on full training set
##    user  system elapsed 
##  39.926   6.174   9.357

Show GLM results

# Print the fitted caret object: preprocessing, resampling scheme, CV metrics
model_glm
## Generalized Linear Model 
## 
## 1460 samples
##   82 predictor
## 
## Pre-processing: median imputation (109), centered (109), scaled (109),
##  remove (139) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1313, 1313, 1313, 1315, 1313, 1314, ... 
## Resampling results:
## 
##   RMSLE      RMSE    
##   0.1588255  36237.95

Plot variable importance

# Variable importance of the baseline GLM, plotted and rendered interactively
p_model_glm_varImp <-
    model_glm %>%
    varImp() %>%
    ggplot() +
    labs(title = "Variable Importance of Generalised Linear Regression Model")

p_model_glm_varImp %>%
    ggplotly(width  = 1000,
             height = 600)

Train extreme gradient boosting machine XGBoost

# system.time(
#     model_xgb <- train(SalePrice ~ .,
#                        data           = df_train,
#                        method         = "xgbTree",
#                        tuneLength     = 10,
#                        na.action      = na.pass,
#                        preProcess     = c("nzv", "medianImpute", "center", "scale"),  # knnImpute
#                        trControl      = ctrl,
#                        metric         = "RMSLE")
# )

Plot XGBoost model overview

# ggplot(model_xgb)

Variable importance plot

# varImp(model_xgb)

10 Model Validation

Validate model, based on loss metric (RMSLE).

# Predict on the training data
# NOTE(review): this is in-sample validation, so the resulting RMSLE is
# optimistic compared to the cross-validated estimate reported above
v_prediction_model_glm <-
    predict(model_glm, newdata = df_train, na.action = na.pass)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# XGBoost predictions, kept for when model_xgb is trained:
# v_prediction_model_xgb <-
#     predict(model_xgb, newdata = df_train, na.action = na.pass)

# Pair observed and predicted SalePrice per house Id
df_validation <-
    df_train %>%
    transmute(Id,
              obs  = SalePrice,
              pred = v_prediction_model_glm)

# Compute RMSLE / RMSE on the training predictions (reuses the caret summary
# function directly, since df_validation has the obs/pred columns it expects)
df_validation %>%
    RMSLE_summary() %>%
    round(digits = 3)
##     RMSLE      RMSE 
##     0.148 28402.109
# Interactive table of observed vs. predicted values per house
df_validation %>%
    datatable()